GitHub

Setup

library(tidyverse)
library(httr)
library(jsonlite)
source("times.R")
source("io.R")
source("manipulate_data.R")

Direct link to dataset

### get real-time data through API
url <- "https://maps2.dcgis.dc.gov/dcgis/rest/services/DCGIS_DATA/Transportation_WebMercator/MapServer/5/query?where=1%3D1&outFields=*&outSR=4326&f=json"

# 1) use the URL to make a request from the API
data_json <- GET(url = url,
                 user_agent("Georgetown University Assignment"))
# 2) Check for a server error in the response
http_status(data_json)
## $category
## [1] "Success"
## 
## $reason
## [1] "OK"
## 
## $message
## [1] "Success: (200) OK"
# 3) get the contents of the response as a text string
data_json <- content(data_json, as = "text")
# 4) create a character matrix from the JSON
data_matrix <- fromJSON(data_json)
# 5) turn the body of the character matrix into a tibble
realtime_data <- as_tibble(data_matrix$features$attributes)
#help needed: how to use lower case for var names?

### get historical data through CSVs
cbs_data <- read_files('data/')

data <- sep_departures_from_arrivals(cbs_data) %>%
  filter_by_distance(from_station = 'lincoln_memorial', distance_m = 1600) # added filtering

hour_data <- get_station_hourly(data) %>% 
  get_historic_weather() # added weather

Visualizations by time at Lincoln Memorial

Amount of rides at the Lincoln memorial over all the time, and by various metrics

# The grouped data does not have the original date column, but it can be nice
#  for visualizations. There is a function to do this in times.R
hour_data_date <- add_date_column(hour_data)

hour_data_date %>%  # for me here there is a weird gap after July 2020, not sure why?
  ggplot() +
  geom_point(aes(x = date, y = lincoln_memorial,
                 color = type))

# by weekday
hour_data_date %>% 
  mutate(weekday = wday(date, label=TRUE)) %>% 
  ggplot() +
  geom_point(aes(x = weekday, y = lincoln_memorial,
                 color=type)) # clearly people cycle more on saturdays and sundays

# by day of the month
hour_data_date %>% 
  ggplot() +
  geom_point(aes(x = day, y = lincoln_memorial,
                 color=type)) # there is no clear pattern across days of the month - aka probably not a good predictor

# farah's code here although similar to first one here?
#hour_data_date %>%
  #mutate(weekday = wday(date, label=TRUE)) %>%
  #ggplot(aes(x = weekday, y = lincoln_memorial)) +
  #geom_point(alpha = 0.25) +
  #labs(title = "Capital bikeshare Ridership on Week days") +
  #theme_minimal()
# by hour
hour_data_date %>% 
  ggplot() +
  geom_point(aes(x = hour, y = lincoln_memorial,
                 color=type)) # people cycle more in the middle of the day, like between 1 and 7PM

hour_data_date %>% 
  ggplot() +
  geom_point(aes(x = month, y = lincoln_memorial,
                 color=type)) # people cycle most in Spring, summer and fall. SUprised that there is no big spike in summer months

hour_data_date %>% # something is off with the year variable
  ggplot() +
  geom_point(aes(x = year, y = lincoln_memorial,
                 color=type)) # overall, more people use the service in 2021 than in 2020

# for a specific month and day, seeing variance by hour
hour_data_date %>% 
  filter(year == 2021, month == "Aug", day == 15) %>% 
  ggplot() +
  geom_line(aes(x  =hour, y = lincoln_memorial,
                color=type))

Heat maps

# The below data can be used for a heat map.
coord_data <- data %>% 
  select(station, lat, lng) %>% 
  filter(duplicated(station) == FALSE)

heat_map_data <- data %>% 
  # filter() %>% Filter here if you want a certain date range
  group_by(station, type) %>% 
  summarize(count = n()) %>% 
  pivot_wider(names_from = type, values_from = count) %>% 
  left_join(coord_data) %>% 
  filter(!is.na(lat), !is.na(lng))
## `summarise()` has grouped output by 'station'. You can override using the `.groups` argument.
## Joining, by = "station"
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(tigris)
## To enable 
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
states <- tigris::states(cb = TRUE, progress_bar = FALSE) %>% 
    st_crop(xmin = -77.4, xmax = -76.8,
    ymin = 38.75, ymax = 39.15)
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
dep_sf <- heat_map_data %>% 
  st_as_sf(coords=c("lng", "lat"), remove = FALSE) %>% 
  st_set_crs(value = 4326)

dep_sf %>% 
  ggplot() + 
  geom_sf(data = states, fill = NA ) +
  geom_sf(aes(color = departure), alpha = 0.3)

dep_sf %>% 
  ggplot() + 
  geom_sf(data = states, fill = NA ) +
  geom_sf(aes(color = arrival), alpha = 0.3)

dep_sf %>% 
  ggplot() + 
  geom_sf(data = states, fill = NA ) +
  geom_sf(aes(color = departure - arrival), alpha = 0.3)

Visualizations by weather at Lincoln Memorial

<<<<<<< Updated upstream

hour_data_date %>%
  ggplot(aes(x=maximum_temperature,y=lincoln_memorial))+
  geom_point(alpha=0.07,color='yellow')+
  labs(x='Temperature', y= 'Hourly Departures and Arrivals')+
  geom_smooth(method='lm',color= 'red')
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1098 rows containing non-finite values (stat_smooth).
## Warning: Removed 1098 rows containing missing values (geom_point).

hour_data_date %>%
  ggplot(aes(x=maximum_temperature,y=lincoln_memorial))+
  geom_point(alpha=0.07,color='orange')+
  labs(x='Temperature', y= 'Hourly Departures and Arrivals')+
  geom_smooth(method='lm',color= 'red') # as the maximum temperature goes up, departures and arrivals go up
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1098 rows containing non-finite values (stat_smooth).

## Warning: Removed 1098 rows containing missing values (geom_point).

# I got this code online so still want to improve this a little bit
# also notice that there is missing data
hour_data_date %>%
  ggplot(aes(x=wind_speed,y=lincoln_memorial))+
  geom_point(alpha=0.07,color='blue')+
  labs(x='Wind Speed', y= 'Hourly Departures and Arrivals') 
## Warning: Removed 1098 rows containing missing values (geom_point).

  #geom_smooth(method='lm',color= 'red') # when I add this I get an upward trend, which is weird since it's a negative relationship?

# I got this code online so still want to improve this a little bit
# also notice that there is missing data
hour_data_date %>%
  ggplot(aes(x=precipitation,y=lincoln_memorial))+
  geom_point(alpha=0.07,color='blue')+
  labs(x='Precipiation', y= 'Hourly Departures and Arrivals') +
  geom_smooth(method='lm',color= 'red') # Negative relationship between bike usage and precipitation as expected
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1098 rows containing non-finite values (stat_smooth).
## Warning: Removed 1098 rows containing missing values (geom_point).

# definitely need to do some transformations to the rain variable as there doesn't seem to be a lot of variation?